This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#install.packages("tidytext")
library(gutenbergr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ readr 2.1.3
## ✔ tibble 3.1.8 ✔ purrr 0.3.5
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Q1: look up the Project Gutenberg IDs of both novels.
# `gutenberg_metadata` is already visible once gutenbergr is attached, so no
# self-assignment is needed. Columns are referenced by bare name inside
# filter() so the conditions are evaluated against the piped data frame.
treasure_island <- gutenberg_metadata %>%
  filter(
    str_detect(title, "Treasure Island"),
    str_detect(author, "Stevenson, Robert Louis")
  )
kidnapped <- gutenberg_metadata %>%
  filter(
    str_detect(title, "Kidnapped"),
    str_detect(author, "Stevenson, Robert Louis")
  )
treasure_island_id <- treasure_island$gutenberg_id
kidnapped_id <- kidnapped$gutenberg_id
# Print every matching ID (a title can match more than one edition).
cat("Gutenberg IDs of Treasure Island: ", treasure_island_id, "\n")
## Gutenberg IDs of Treasure Island: 120 23936 27780
cat("Gutenberg IDs of Kidnapped: ", kidnapped_id, "\n")
## Gutenberg IDs of Kidnapped: 421 56562
##Q2 Download the texts of these two books from the gutenbergr package.
# Download Treasure Island.
# head(1) keeps the first matching *row*; indexing the tibble with `[1]`
# would select the first column instead of the first work.
treasure_island_book <- gutenberg_works(
  title == "Treasure Island",
  author == "Stevenson, Robert Louis"
) %>%
  head(1)
treasure_island_book_text <- gutenberg_download(
  treasure_island_book$gutenberg_id
)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Download Kidnapped (same approach: first matching work only).
kidnapped_book <- gutenberg_works(
  title == "Kidnapped",
  author == "Stevenson, Robert Louis"
) %>%
  head(1)
kidnapped_book_text <- gutenberg_download(kidnapped_book$gutenberg_id)
##Q3 Find the 10 most common words (that are not stop words) in each novel.
# Split each book's text into one word token per row.
treasure_island_words <- unnest_tokens(treasure_island_book_text, word, text)
kidnapped_words <- unnest_tokens(kidnapped_book_text, word, text)
# Remove stop words, then tally the remaining words, most frequent first.
treasure_island_filtered_wordcount <- count(
  anti_join(treasure_island_words, stop_words),
  word,
  sort = TRUE
)
## Joining, by = "word"
kidnapped_filtered_wordcount <- count(
  anti_join(kidnapped_words, stop_words),
  word,
  sort = TRUE
)
## Joining, by = "word"
# Keep the ten most frequent words of each book.
top10_treasure_island <- head(treasure_island_filtered_wordcount, 10)
top10_kidnapped <- head(kidnapped_filtered_wordcount, 10)
cat("TOP 10 words in Treasure Island")
## TOP 10 words in Treasure Island
top10_treasure_island[["word"]]
## [1] "captain" "silver" "doctor" "time" "hand" "sea" "hands"
## [8] "i’ll" "cried" "sir"
cat("TOP 10 words in Kidnapped")
## TOP 10 words in Kidnapped
top10_kidnapped[["word"]]
## [1] "alan" "ye" "house" "time" "set" "hand" "sir"
## [8] "cried" "day" "country"
# Combine the 50 most frequent non-stop words of each book into one data frame.
# Only the top 50 are used to keep the plot readable; the full
# <book>_filtered_wordcount tables could be used directly instead.
# The `top100_` prefix is kept for compatibility although 50 rows are taken.
top100_treasure_island <- treasure_island_filtered_wordcount %>% head(50)
top100_kidnapped <- kidnapped_filtered_wordcount %>% head(50)
word_counts <- bind_rows(
  mutate(top100_treasure_island, book = "Treasure Island"),
  mutate(top100_kidnapped, book = "Kidnapped")
)
# Share of each word within the total count of its book's selected words.
book_word_proportions <- word_counts %>%
  group_by(book) %>%
  mutate(
    total_words = sum(n),
    proportion = n / total_words
  ) %>%
  ungroup()
# Dot plot of word proportions, one colour per book, words ordered by
# decreasing proportion.
proportion_plot <- ggplot(
  book_word_proportions,
  aes(x = proportion, y = reorder(word, -proportion), color = book)
) +
  geom_point() +
  # scale_x_continuous(labels = scales::percent_format()) +
  labs(x = "Proportion of Non-Stop Words", y = "Word", color = "Book")
# Pass the plot explicitly: a bare ggplotly() renders whatever ggplot2 last
# recorded, which silently breaks if another plot is built in between.
ggplotly(proportion_plot)
# Pool the tokens of both books, reduce each token to its first run of a-z
# letters, and remove stop words.
# NOTE(review): str_extract() returns NA for tokens without any a-z letter and
# those rows are not removed here - confirm whether they should be dropped.
frequency <- bind_rows(
  mutate(treasure_island_words, book = "Treasure Island"),
  mutate(kidnapped_words, book = "Kidnapped")
) %>%
  mutate(word = str_extract(word, "[a-z]+")) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Proportion of each word within its own book. The grouping is released with
# ungroup() as soon as the per-book total has been used.
frequency <- frequency %>%
  count(book, word) %>%
  group_by(book) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n)
# One proportion column per book so the two novels can be compared row by row;
# a word missing from a book gets NA in that book's column.
frequency <- frequency %>%
  pivot_wider(names_from = "book", values_from = "proportion")
head(frequency, 5)
## # A tibble: 5 × 3
## word Kidnapped `Treasure Island`
## <chr> <dbl> <dbl>
## 1 ab 0.0000401 NA
## 2 aback 0.0000401 0.000134
## 3 abandoned 0.0000401 0.0000896
## 4 abashed 0.000120 NA
## 5 abate 0.0000401 NA
# Log-log scatter of each word's proportion in Treasure Island (x) against
# Kidnapped (y); the dashed red line is the default geom_abline() reference.
cor_plot <- ggplot(frequency, aes(x = `Treasure Island`, y = `Kidnapped`)) +
  geom_abline(color = "red", lty = 2, lwd = 2) +
  geom_point(color = "blue") +
  geom_text(aes(label = word), check_overlap = TRUE) +
  scale_x_log10() +
  scale_y_log10()
# Pass the plot explicitly instead of relying on ggplot2's "last plot" state.
ggplotly(cor_plot)
# Correlation between the two books' word proportions, computed only over
# words that occur in both books. Missing proportions are detected with
# is.na(); comparing a numeric column to the string "NA" never matches a real
# missing value and only filtered rows because the comparison itself was NA.
frequency %>%
  filter(!is.na(`Treasure Island`), !is.na(Kidnapped)) %>%
  select(Kidnapped, `Treasure Island`) %>%
  cor()
## Kidnapped Treasure Island
## Kidnapped 1.0000000 0.4429402
## Treasure Island 0.4429402 1.0000000
##Find two words that appear with a high frequency in Kidnapped but not in Treasure Island.
##Find two words that appear with a high frequency in Treasure Island but not in Kidnapped.
##Find two words that appear with high frequency in both novels.
# Ten most frequent bigrams in Treasure Island in which neither word is a
# stop word.
common_bigrams <- treasure_island_book_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Rows that yield no bigram come back as a missing value, not the text "NA".
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  head(10)
common_bigrams
## # A tibble: 10 × 2
## bigram n
## <chr> <int>
## 1 dr livesey 38
## 2 ben gunn 31
## 3 captain smollett 29
## 4 spy glass 24
## 5 black dog 19
## 6 block house 17
## 7 admiral benbow 15
## 8 cried silver 15
## 9 john silver 15
## 10 log house 14
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.